from pymongo import MongoClient
from bs4 import BeautifulSoup
from pprint import pprint
import requests
import datetime
import json
import time
import re

# MongoDB connection to the instance listening on 127.0.0.1:27017.
server_client = MongoClient(host='127.0.0.1', port=27017)

# Database and collection handles used by the scraper below.
server_db = server_client.get_database('knx_posts_db')
offical_posts_coll = server_db.get_collection('offical_posts_coll')


class POST():
    """Scrape job postings from careers.longigroup.com and upsert them into MongoDB.

    Instantiating the class immediately runs :meth:`scrapy`, which walks the
    listing pages, opens each posting's detail page and writes one document
    per posting to ``offical_posts_coll``.
    """

    # Listing endpoint; the page number is appended to this prefix.
    LIST_URL = 'http://careers.longigroup.com/index.php?m=content&c=index&a=lists&catid=27&page='
    # Company name stored on every document and used in the upsert filter.
    COMPANY = '隆基乐叶光伏科技有限公司'
    # Seconds before an HTTP request is abandoned; without it requests can
    # block indefinitely on an unresponsive server.
    TIMEOUT = 30
    # Explicit parser so results do not depend on which optional parsers
    # happen to be installed (and no "guessed parser" warning is emitted).
    PARSER = 'html.parser'

    def __init__(self):
        """Run the scrape as soon as the object is created."""
        self.scrapy()

    def scrapy(self):
        """Fetch listing pages 1-3 and upsert every posting found on them.

        A listing page or posting that cannot be downloaded or does not have
        the expected layout is reported on stdout and skipped, so one bad
        page no longer aborts the whole run. Documents are matched on
        company, name and location and replaced (or inserted when absent).
        """
        for page in range(1, 4):
            list_url = self.LIST_URL + str(page)
            list_soup = self._fetch_soup(list_url)
            if list_soup is None:
                continue

            for jd in list_soup.select('.joblist-div .ul_2 .ul_3'):
                item = self._build_item(jd, list_url)
                if item is None:
                    continue

                offical_posts_coll.replace_one(
                    {
                        'company': item['company'],
                        'name': item['name'],
                        'location': item['location'],
                    },
                    item,
                    upsert=True,
                )
                print(item['company'], item['name'])

    def _fetch_soup(self, url):
        """Download *url* and return it parsed, or ``None`` if the request fails."""
        try:
            response = requests.get(url, timeout=self.TIMEOUT)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as err:
            print('skip', url, err)
            return None

        # The site is decoded as UTF-8 regardless of what the headers declare.
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, self.PARSER)

    @staticmethod
    def _parse_count(detail_soup):
        """Return the head count shown after '招聘人数：', or ``None`` if it cannot be read."""
        label = detail_soup.find(text=re.compile('招聘人数：'))
        try:
            # The number sits in the text of the label's grandparent element,
            # right after the first full-width colon.
            return int(label.parent.parent.get_text().split('：')[1])
        except (AttributeError, IndexError, ValueError):
            return None

    def _build_item(self, jd, list_url):
        """Build the MongoDB document for one listing row, or ``None`` to skip it.

        *jd* is the listing-row element; *list_url* is only used in the
        diagnostic message when the row is malformed.
        """
        title_cell = jd.find(class_='zw_01')
        link = title_cell.find('a') if title_cell is not None else None
        location_cell = jd.find(class_='zw_03')
        date_cell = jd.find(class_='zw_04')
        if link is None or location_cell is None or date_cell is None:
            print('skip malformed listing row on', list_url)
            return None

        url = link.get('href')
        if not url:
            print('skip listing row without a link on', list_url)
            return None

        detail_soup = self._fetch_soup(url)
        if detail_soup is None:
            return None

        count = self._parse_count(detail_soup)
        sections = detail_soup.select('.zhaopin_wz')
        if count is None or len(sections) < 2:
            print('skip', url, 'unexpected detail page layout')
            return None

        # The description is the text of the first two '.zhaopin_wz' sections.
        description = sections[0].get_text() + sections[1].get_text()

        return {
            "url": url,
            'edu': '',
            'exp': [],
            'name': link.text,
            'date': date_cell.text,
            'lang': '',
            'place': '',
            'major': '',
            'count': count,
            'salary': [],
            'toSchool': True,
            'welfare': [],
            'funType': '',
            'company': self.COMPANY,
            'location': location_cell.text,
            'industry': '',
            'keywords': [],
            'platform': 'offical',
            'searchKeyword': '',
            'description': description,
            'subIndustry': '',
            # Scrape timestamp in local time.
            'stime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }


# Kick off the scrape: constructing POST calls scrapy() from __init__.
# NOTE(review): this is a module-level statement, so it also runs when the
# file is imported, not only when it is executed as a script.
POST()
